# -*- coding: utf-8 -*-
"""
Created on Mon Nov 13 19:09:08 2017

@author: Alex
"""

import pandas as pd
import numpy as np
from io import BytesIO
from MIDAS import MIDAS
from sklearn.preprocessing import MinMaxScaler
from zipfile import ZipFile
import requests
import os

# Load the World Development Indicators table, one row per (country, year)
# and one column per indicator code.  The reshaped table is cached on disk
# so the bulk download only happens on the first run.
directory = 'data/WDI.csv'
if not os.path.exists(directory):
  os.makedirs('data/', exist_ok= True)
  r = requests.get("http://databank.worldbank.org/data/download/WDI_csv.zip")
  # Fail here on an HTTP error instead of trying to unzip an error page.
  r.raise_for_status()
  with ZipFile(BytesIO(r.content)) as z:
    with z.open("WDIData.csv") as f:
      wdi_data = pd.read_csv(f, skipinitialspace=True)
  # Columns pandas auto-names 'Unnamed: N' carry no header in the file; their
  # position depends on how many year columns the release has, so they are
  # matched by prefix rather than by a fixed number.
  unnamed_cols = [col for col in wdi_data.columns
                  if str(col).startswith('Unnamed')]
  wdi_data.drop(['Country Name', 'Indicator Name'] + unnamed_cols,
                axis= 1, inplace= True)
  # Wide (one column per year) -> long (one row per country/indicator/year).
  wdi_data = wdi_data.melt(id_vars= ['Country Code', 'Indicator Code'],
                           var_name= 'Year')
  # The year labels come from column headers and are strings at this point;
  # convert them so this branch yields the same integer 'Year' column as
  # re-reading the cached CSV does.
  wdi_data['Year'] = wdi_data['Year'].astype(int)
  # Long -> one column per indicator code.
  wdi_data = wdi_data.set_index(['Country Code', 'Year']).pivot(columns= 'Indicator Code')
  wdi_data.reset_index(level= ['Country Code', 'Year'], inplace= True)
  # Flatten the two-level column labels: use the second level when it is
  # non-empty, otherwise fall back to the first level.
  wdi_names = [colname[1] if colname[1] != '' else colname[0]
               for colname in wdi_data.columns]
  wdi_data.columns = wdi_names
  wdi_data.to_csv(directory)
else:
  # The cached file was written with its row index, which comes back as an
  # extra 'Unnamed: 0' column.
  wdi_data = pd.read_csv(directory, low_memory= False).drop("Unnamed: 0", axis= 1)

# Imputation target and the country/year sample it is evaluated on.
t_var = 'NY.GDP.MKTP.CD' # GDP (current US$)
# Cameroon, Rep. of Congo, Cote d'Ivoire, Ghana, Niger, Zambia
# NOTE(review): this list was previously annotated with Mozambique, but the
# code 'NER' is Niger (Mozambique would be 'MOZ') -- confirm which is meant.
t_cty = ['CMR', 'COG', 'CIV', 'GHA', 'NER', 'ZMB']
t_yrs = np.arange(1970, 2001)

# Keep only the selected countries and years, renumbering rows from zero.
in_sample = wdi_data['Country Code'].isin(t_cty) & wdi_data['Year'].isin(t_yrs)
wdi_data = wdi_data.loc[in_sample].reset_index(drop= True)

# Set the target column and the row keys aside before building the features.
orig = wdi_data[t_var]
country_year = wdi_data[['Country Code', 'Year']].copy()

# Features: every remaining column that has at least one observation,
# preceded by one indicator column per country.
features = wdi_data.drop(['Country Code', t_var], axis= 1)
features = features.dropna(axis= 1, how= 'all')
wdi_data = pd.concat([pd.get_dummies(country_year['Country Code']), features],
                     axis= 1)

# Min-max scale each column.  Missing cells are filled with the column median
# only so the scaler can run; they are blanked out again afterwards.
na_marker = wdi_data.isnull()
scaler = MinMaxScaler()
median_filled = wdi_data.fillna(wdi_data.median())
wdi_data = pd.DataFrame(scaler.fit_transform(median_filled),
                        columns= wdi_data.columns)
wdi_data = wdi_data.mask(na_marker)

# Scale the target to [0, 1] as well, keeping the bounds so predictions can
# be mapped back to the original units.
orig_min, orig_max = orig.min(), orig.max()
orig = (orig - orig_min) / (orig_max - orig_min)

def generate_lag(target, lag = -1):
  """Return ``target`` shifted along the years of the global ``country_year``.

  Every row whose year is ``Y`` receives the values that ``target`` holds on
  the rows whose year is ``Y - lag``.  A negative ``lag`` (the default) thus
  pulls values from later years and a positive ``lag`` from earlier years.
  Rows whose source year does not occur in ``country_year['Year']`` stay NaN.

  Values are copied positionally from one year's rows to another's, so each
  year must have the same number of rows, listed in the same order, for the
  rows to line up.

  Args:
    target: Series sharing its index with ``country_year``.
    lag: Offset in years between a row and the row its value is taken from.

  Returns:
    A new Series on ``target``'s index holding the shifted values.
  """
  years = country_year['Year']
  distinct_years = years.unique()
  # Built once: the original recomputed unique() for every membership test.
  known_years = set(distinct_years)
  manip = pd.Series(np.nan, index= target.index)
  for year in distinct_years:
    source_year = year - lag
    if source_year not in known_years:
      continue
    manip_idx = country_year.index[years == year]
    target_idx = country_year.index[years == source_year]
    # .values makes the positional copy explicit; the two label sets are
    # disjoint, so label alignment must not be applied here.
    manip.loc[manip_idx] = target.loc[target_idx].values
  return manip

# Leave-one-out evaluation: for every selected country and year, blank out
# that single target observation, train a fresh MIDAS model on everything
# else, and record the values the model produces for the blanked cell.

# Create the output directory up front so a missing folder is not discovered
# only after a whole country's worth of models has been trained.
os.makedirs('wdi', exist_ok= True)

# Offsets passed to generate_lag, in the column order the model receives
# them: -1 .. -10 first, then 1 .. 10.
lag_offsets = [-k for k in range(1, 11)] + list(range(1, 11))

for cty in t_cty:
  output_list = []
  for year in t_yrs:
    idx = country_year.index[(country_year['Country Code'] == cty) & (country_year['Year'] == year)]
    # Copy of the scaled target with only this country-year removed.
    test = orig.copy()
    test[idx] = np.nan
    # Shifted copies of the masked target, used as extra predictors.
    lag_features = [generate_lag(test, offset) for offset in lag_offsets]
    add_data = pd.concat([wdi_data] + lag_features, axis= 1)
    imputer = MIDAS([1024, 512], learn_rate= 3e-5)
    imputer.build_model(pd.DataFrame(test), additional_data= add_data)

    imputer.train_model(training_epochs= 2000, verbosity_ival= 750)
    results = []
    # Each yielded frame is read at the held-out row, first column, and
    # mapped back from the [0, 1] scale to the original units.
    for sample in imputer.batch_yield_samples(200, 16):
      results.append((sample.values[idx][0][0] * (orig_max - orig_min)) + orig_min)
    output_list.append(pd.Series(results, name= year))
  csv_path = 'wdi/' + cty +  '_output.csv'
  # NOTE(review): concatenating along axis 0 stacks the per-year series end
  # to end, so the year stored in each Series name is not written to the
  # CSV -- confirm this layout is what downstream code expects.
  output = pd.concat(output_list, axis= 0)
  output.to_csv(csv_path)










